The COVID-19 pandemic has seemingly touched all aspects of life, and especially the world economy. It is widely believed that because of the pandemic, the real estate market in urban areas has seen plummeting prices and demand, due to fears of congregating in high-density spaces such as cities. I decided to put this assumption to the test using a city I thought would be a prominent example: New York City. Using daily data from November 2019 to November 2020, I sought to explore any effects that heightened coronavirus cases would have on property sales. I wanted to see whether, as the pandemic got worse in certain areas, prices would go down.
import plotly.express as px
import urllib.request
import pandas as pd
from plotly.graph_objs import Scatter, Figure, Layout
import plotly
import plotly.graph_objs as go
import json
import numpy as np
#Dataframe Importation from CSVs
def read_geojson(url):
    """Download a GeoJSON file from *url* and return it as a parsed dict.

    Fix: the original bound the response object to a variable also named
    ``url``, shadowing the parameter; the response now gets its own name.
    """
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode())
#Load the three source CSVs: property sales, daily COVID stats, borough populations
Property_csv = pd.read_csv('Property_Data.csv')
Corona_csv = pd.read_csv('Corona_Virus_Daily.csv')
Population_csv = pd.read_csv('Borough_Pop - Sheet1.csv')
#Parse the sale date once, then derive the string forms used for joins/grouping
Property_csv['date_joined'] = Property_csv['SALE DATE'].astype('datetime64[ns]')
Property_csv['date_joined_str'] = Property_csv['date_joined'].astype(str)
#Month-year label; note the leading space produced by the " %B %Y" format
Property_csv['month_year_sold'] = pd.DatetimeIndex(Property_csv['date_joined']).strftime(" %B %Y").astype(str)
#ZIP codes as strings so they can match the GeoJSON's postalCode property
Property_csv["ZIP CODE"] = Property_csv["ZIP CODE"].astype(str)
#Drop tax class 4 rows: those are non-residential properties
Property_csv = Property_csv[Property_csv["TAX CLASS AT PRESENT"] != '4']
#NYC ZIP-code polygon GeoJSON used by the choropleth maps
NY_url = "https://raw.githubusercontent.com/fedhere/PUI2015_EC/master/mam1612_EC/nyc-zip-code-tabulation-areas-polygons.geojson"
NY_Data = read_geojson(NY_url)
#Melt the wide daily COVID table into long form: one row per (date, metric)
_corona_value_vars = [
    'CASE_COUNT', 'PROBABLE_CASE_COUNT', 'HOSPITALIZED_COUNT', 'DEATH_COUNT',
    'PROBABLE_DEATH_COUNT', 'CASE_COUNT_7DAY_AVG', 'ALL_CASE_COUNT_7DAY_AVG',
    'HOSP_COUNT_7DAY_AVG', 'DEATH_COUNT_7DAY_AVG', 'ALL_DEATH_COUNT_7DAY_AVG',
    'BX_CASE_COUNT', 'BX_HOSPITALIZED_COUNT', 'BX_DEATH_COUNT',
    'BX_CASE_COUNT_7DAY_AVG', 'BX_HOSPITALIZED_COUNT_7DAY_AVG',
    'BX_DEATH_COUNT_7DAY_AVG',
    'BK_CASE_COUNT', 'BK_HOSPITALIZED_COUNT', 'BK_DEATH_COUNT',
    'BK_CASE_COUNT_7DAY_AVG', 'BK_HOSPITALIZED_COUNT_7DAY_AVG',
    'BK_DEATH_COUNT_7DAY_AVG',
    'MN_CASE_COUNT', 'MN_HOSPITALIZED_COUNT', 'MN_DEATH_COUNT',
    'MN_CASE_COUNT_7DAY_AVG', 'MN_HOSPITALIZED_COUNT_7DAY_AVG',
    'MN_DEATH_COUNT_7DAY_AVG',
    'QN_CASE_COUNT', 'QN_HOSPITALIZED_COUNT', 'QN_DEATH_COUNT',
    'QN_CASE_COUNT_7DAY_AVG', 'QN_HOSPITALIZED_COUNT_7DAY_AVG',
    'QN_DEATH_COUNT_7DAY_AVG',
    'SI_CASE_COUNT', 'SI_HOSPITALIZED_COUNT', 'SI_DEATH_COUNT',
    'SI_CASE_COUNT_7DAY_AVG', 'SI_HOSPITALIZED_COUNT_7DAY_AVG',
    'SI_DEATH_COUNT_7DAY_AVG',
]
Corona_Melt = Corona_csv.melt(id_vars=['date_of_interest'],
                              value_vars=_corona_value_vars,
                              var_name='Type', value_name='Number_Of_Cases')
#Function to retrieve the borough
def Get_Borough_From_Column(s):
    """Map the borough prefix of a melted 'Type' name to a full borough name.

    Expects a row with a "Type" field like "BX_CASE_COUNT".  Unrecognized
    prefixes (e.g. the city-wide "CASE_COUNT" columns) are returned as-is.
    """
    _PREFIX_TO_BOROUGH = {
        "SI": "Staten Island",
        "BX": "Bronx",
        "BK": "Brooklyn",
        "MN": "Manhattan",
        "QN": "Queens",
    }
    prefix = s["Type"].split("_")[0]
    return _PREFIX_TO_BOROUGH.get(prefix, prefix)
#Tag each melted row with its borough name
Corona_Melt["Borough"] = Corona_Melt.apply(Get_Borough_From_Column, axis=1)
#Attach borough population figures (left join keeps every COVID row)
Corona_Melt = Corona_Melt.merge(Population_csv, how='left', on='Borough')
def Get_Type_of_Count(s):
    """Return the metric portion of a melted 'Type' name (everything after
    the borough prefix), as the string form of a list — e.g.
    "BX_CASE_COUNT" -> "['CASE', 'COUNT']"."""
    return str(s["Type"].split("_")[1:])
#Label each row with the kind of statistic it carries
Corona_Melt["Type_Of_Count"] = Corona_Melt.apply(Get_Type_of_Count, axis=1)
Corona_Melt  # notebook-style display; no effect when run as a script
#Share of the borough's population represented by the count
Corona_Melt["Percentage of Borough Pop Inflicted"] = (
    Corona_Melt["Number_Of_Cases"] / Corona_Melt["Population"]
)
Corona_Melt["Percentage of Borough Pop Inflicted"]  # display
I incorporated data from numerous sources to help construct this project. For my coronavirus data, I used official data gathered by NYC.gov on daily cases and deaths in each borough.
The property sales data was sourced from the NYC.gov Department of Finance site and covers every non-utility property that sold in the past twelve months, by borough and postal code.
Finally, I connected all of this data to Plotly maps using a GeoJSON file, usable with choropleth maps, created by GitHub user "fedhere".
#Melt the wide daily COVID table into long form (repeated notebook cell)
_corona_value_vars = [
    'CASE_COUNT', 'PROBABLE_CASE_COUNT', 'HOSPITALIZED_COUNT', 'DEATH_COUNT',
    'PROBABLE_DEATH_COUNT', 'CASE_COUNT_7DAY_AVG', 'ALL_CASE_COUNT_7DAY_AVG',
    'HOSP_COUNT_7DAY_AVG', 'DEATH_COUNT_7DAY_AVG', 'ALL_DEATH_COUNT_7DAY_AVG',
    'BX_CASE_COUNT', 'BX_HOSPITALIZED_COUNT', 'BX_DEATH_COUNT',
    'BX_CASE_COUNT_7DAY_AVG', 'BX_HOSPITALIZED_COUNT_7DAY_AVG',
    'BX_DEATH_COUNT_7DAY_AVG',
    'BK_CASE_COUNT', 'BK_HOSPITALIZED_COUNT', 'BK_DEATH_COUNT',
    'BK_CASE_COUNT_7DAY_AVG', 'BK_HOSPITALIZED_COUNT_7DAY_AVG',
    'BK_DEATH_COUNT_7DAY_AVG',
    'MN_CASE_COUNT', 'MN_HOSPITALIZED_COUNT', 'MN_DEATH_COUNT',
    'MN_CASE_COUNT_7DAY_AVG', 'MN_HOSPITALIZED_COUNT_7DAY_AVG',
    'MN_DEATH_COUNT_7DAY_AVG',
    'QN_CASE_COUNT', 'QN_HOSPITALIZED_COUNT', 'QN_DEATH_COUNT',
    'QN_CASE_COUNT_7DAY_AVG', 'QN_HOSPITALIZED_COUNT_7DAY_AVG',
    'QN_DEATH_COUNT_7DAY_AVG',
    'SI_CASE_COUNT', 'SI_HOSPITALIZED_COUNT', 'SI_DEATH_COUNT',
    'SI_CASE_COUNT_7DAY_AVG', 'SI_HOSPITALIZED_COUNT_7DAY_AVG',
    'SI_DEATH_COUNT_7DAY_AVG',
]
Corona_Melt = Corona_csv.melt(id_vars=['date_of_interest'],
                              value_vars=_corona_value_vars,
                              var_name='Type', value_name='Number_Of_Cases')
#Functions
def Get_Borough_From_Column(s):
    """Map the borough prefix of a melted 'Type' name to a full borough name.

    Unrecognized prefixes (city-wide columns such as "CASE_COUNT") are
    returned unchanged.
    """
    _PREFIX_TO_BOROUGH = {
        "SI": "Staten Island",
        "BX": "Bronx",
        "BK": "Brooklyn",
        "MN": "Manhattan",
        "QN": "Queens",
    }
    prefix = s["Type"].split("_")[0]
    return _PREFIX_TO_BOROUGH.get(prefix, prefix)
#Borough label per melted row
Corona_Melt["Borough"] = Corona_Melt.apply(Get_Borough_From_Column, axis=1)

def Get_Type_of_Count(s):
    """Return the metric portion of 'Type' (after the borough prefix) as the
    string form of a list, e.g. "['CASE', 'COUNT']"."""
    return str(s["Type"].split("_")[1:])

Corona_Melt["Type_Of_Count"] = Corona_Melt.apply(Get_Type_of_Count, axis=1)
#Parse the date and derive the same string forms used on the property data
Corona_Melt['date_joined'] = Corona_Melt['date_of_interest'].astype('datetime64[ns]')
#Month-year label; the leading space comes from the " %B %Y" format
Corona_Melt['month_year_sold'] = pd.DatetimeIndex(Corona_Melt['date_of_interest']).strftime(" %B %Y").astype(str)
Corona_Melt['date_joined_str'] = Corona_Melt['date_joined'].astype(str)
The coronavirus data had to be melted into a three-column dataframe in order to be properly analyzed. Once this was set, two more columns were added, which delineated the borough and the case statistic of interest (total cases, number of deaths, 7-day average of cases, etc.).
#Chronological ordering for the 'month_year_sold' labels.  Every entry keeps
#the leading space produced by the " %B %Y" strftime format used above, so
#these strings compare equal to the generated column values.
date_order = [
    ' November 2019', ' December 2019', ' January 2020', ' February 2020',
    ' March 2020', ' April 2020', ' May 2020', ' June 2020', ' July 2020',
    ' August 2020', ' September 2020', ' October 2020', ' November 2020',
    ' December 2020',
]
#Function groups by up to 2 factors then aggregates by a metric of choice
def Group_by_two_factors_df(Fact1="", text_hover="", Value="", df="", aggregator="mean"):
    """Group *df* by ``Fact1`` (and ``text_hover`` when given) and aggregate
    the ``Value`` column with *aggregator* (any name pandas ``agg`` accepts).

    Returns a new DataFrame; the caller's frame is never mutated.

    Fixes vs. the original:
    - the numeric coercion read the *global* ``Property_csv[Value]`` instead
      of the ``df`` argument, so it only worked by accident when ``df`` was
      (aligned with) Property_csv — it now uses ``df``;
    - the bare ``except:`` is narrowed to the errors the coercion can raise;
    - ``df`` is copied so the comma-stripping does not leak into the caller.
    """
    df = df.copy()
    # Sale prices arrive as strings like "1,250,000"; coerce them to numbers.
    # Already-numeric columns raise AttributeError on .str and are left alone.
    try:
        df[Value] = pd.to_numeric(df[Value].str.replace(',', ''))
    except (AttributeError, KeyError, TypeError, ValueError):
        pass
    if text_hover == "":
        text_hover = Fact1
        grouped = df.groupby(Fact1, as_index=False)
    else:
        grouped = df.groupby([Fact1, text_hover], as_index=False)
    return grouped.agg({Value: aggregator}).reset_index(drop=True)
#Function creates a lineplot
def Ploty_Line_Plot(Fact1, Value, df, text_hover="", titler=""):
    """Draw a plotly line chart of *Value* against *Fact1*, one line per
    *text_hover* group (single line when text_hover is empty).  Shows the
    figure as a side effect; returns nothing."""
    if text_hover == "":
        fig = px.line(df, x=Fact1, y=Value)
    else:
        fig = px.line(df, x=Fact1, y=Value, color=text_hover)
    fig.update_layout(
        title=titler,
        xaxis_title=Fact1,
        yaxis_title=Value,
        yaxis=dict(range=[min(df[Value]), max(df[Value])]),
    )
    if Fact1 == "month_year_sold_order":
        # Replace the numeric month-index ticks with readable month labels.
        fig.update_xaxes(tickangle=45,
                         tickmode='array',
                         tickvals=np.arange(0, len(date_order)),
                         ticktext=date_order)
    fig.show()
#Order months by numeric on lineplot, fixes order error
def Month_Orderer_Lineplot(s):
    """Return the chronological index of the row's 'month_year_sold' label
    within ``date_order``, or None when the label is not listed (matching the
    original's implicit None).  Replaces a manual range(len(...)) scan with
    ``list.index``."""
    try:
        return date_order.index(s["month_year_sold"])
    except ValueError:
        return None
#Function plots over NYC zip codes
def Plotly_Zip_Map(color_column, hover_text, df, log_scale=False, slider_text="", titler="", range_of_map=[5.5, 8.5]):
    """Choropleth of *color_column* over NYC ZIP-code polygons (NY_Data).

    When *slider_text* names a column, the map animates over it (one frame
    per value, ordered by date_order) with a fixed color range so frames are
    comparable.  When *log_scale* is truthy the color is log10-transformed.
    Shows the figure as a side effect.

    Fixes vs. the original: the two nearly identical branches are merged
    (only the animation kwargs differ), and the ``if == False / elif == True``
    pair — which left ``color_scaler`` undefined for any non-bool argument —
    is replaced by a single truthiness test.
    """
    if log_scale:
        color_scaler = np.log10(df[color_column])
    else:
        color_scaler = df[color_column]
    # Extra kwargs only for the animated variant.
    animation_kwargs = {}
    if slider_text != "":
        animation_kwargs = dict(animation_frame=slider_text,
                                category_orders={slider_text: date_order},
                                range_color=range_of_map)
    fig = px.choropleth(df, geojson=NY_Data,
                        color=color_scaler,
                        hover_data=[hover_text],
                        locations="ZIP CODE", featureidkey="properties.postalCode",
                        projection="mercator",
                        **animation_kwargs)
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(title=titler)
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.show()
#Format as dollar amount
def as_currency(amount):
    """Render a number as a dollar string, e.g. 1234.5 -> '$1,234.50' and
    -3 -> '-$3.00'."""
    if amount < 0:
        return f'-${-amount:,.2f}'
    return f'${amount:,.2f}'
# creates a scatterplot with trendlines, can set range
def Plotly_Scatter_Plot(Fact1, Value, df, text_hover="", Color_Titler="", titler="", range_of_map=[0, 0]):
    """Scatter *Value* against *Fact1* with an OLS trendline (per color group
    when *Color_Titler* is given; colors are kept stable across charts via
    constant_color_assigner).  ``range_of_map == [0, 0]`` means "auto" — the
    y-range spans the data.  Shows the figure as a side effect.

    Fixes vs. the original: ``df[text_hover]`` was evaluated unconditionally,
    so the default ``text_hover=""`` raised KeyError — hover data is now only
    attached when a column is named.  The dead commented-out Ridge-regression
    experiment is removed.
    """
    hover = [df[text_hover]] if text_hover != "" else None
    if Color_Titler != "":
        fig = px.scatter(df, x=df[Fact1], y=df[Value], color=df[Color_Titler],
                         color_discrete_map=constant_color_assigner(df[Color_Titler]),
                         hover_data=hover, trendline="ols")
    else:
        fig = px.scatter(df, x=df[Fact1], y=df[Value],
                         hover_data=hover, trendline="ols")
    if range_of_map == [0, 0]:
        range_of_map = [min(df[Value]), max(df[Value])]
    fig.update_layout(title=titler, xaxis_title=Fact1, yaxis_title=Value,
                      yaxis=dict(range=range_of_map))
    fig.show()
#Give each category the same color every graph
def constant_color_assigner(column):
    """Build a {category: color} map that is stable across charts.

    Categories are sorted so the same set always receives the same colors;
    with more than 7 categories the palette cycles (identical assignment to
    the original's repeated-list indexing).

    Fix vs. the original: the original doubled the color list on every 7th
    index (exponential list growth for many categories); modulo indexing
    gives the same mapping in O(1) space.
    """
    palette = ['yellow', 'blue', 'green', 'red', 'orange', 'purple', 'pink']
    uniques = column.unique()
    uniques.sort()  # constant order for a given set of values
    return {val: palette[i % len(palette)] for i, val in enumerate(uniques)}
#apply dollar signs to column
def Dollar_Apply(s):
    """Row-apply helper: format the row's ' SALE PRICE ' field as a dollar
    string via as_currency."""
    price = s[" SALE PRICE "]
    return as_currency(price)
#make melted dataframe have borough column
def Get_Borough_From_Column(s):
    """Map the borough prefix of a melted 'Type' name to a full borough name;
    unrecognized prefixes are returned as-is."""
    _PREFIX_TO_BOROUGH = {
        "SI": "Staten Island",
        "BX": "Bronx",
        "BK": "Brooklyn",
        "MN": "Manhattan",
        "QN": "Queens",
    }
    prefix = s["Type"].split("_")[0]
    return _PREFIX_TO_BOROUGH.get(prefix, prefix)
#column for melted dataframe that shows type of case
def Get_Type_of_Count(s):
    """Return the metric portion of 'Type' (after the borough prefix) as the
    string form of a list, e.g. "BX_CASE_COUNT" -> "['CASE', 'COUNT']"."""
    return str(s["Type"].split("_")[1:])
#Scatterplot that plots the corona data by the sales data
def Month_Year_Cases_vs_Sale_Scatter(Time_agg="month_year_sold", Corona_Melt=Corona_Melt, Property_csv=Property_csv, df_corona_agg="mean", df_sale_agg="mean", fact_corona="", fact_sale="", titler="", range_of_map=[0, 0]):
    """Aggregate COVID counts and sale stats per (borough, period), join the
    two, and scatter cases against the sale metric (colored by borough).

    fact_corona selects the Type_Of_Count string (e.g. "['CASE', 'COUNT']");
    fact_sale names the property column to aggregate.  Shows the figure.
    """
    # Keep only the requested kind of COVID statistic.
    corona = Corona_Melt[Corona_Melt["Type_Of_Count"] == fact_corona]
    corona_by_period = Group_by_two_factors_df(Fact1="Borough", text_hover=Time_agg,
                                               Value="Number_Of_Cases", df=corona,
                                               aggregator=df_corona_agg)
    sales_by_period = Group_by_two_factors_df(Fact1="Borough", text_hover=Time_agg,
                                              Value=fact_sale, df=Property_csv,
                                              aggregator=df_sale_agg)
    joined = sales_by_period.merge(corona_by_period, how='left',
                                   on=['Borough', Time_agg])
    Plotly_Scatter_Plot(Fact1="Number_Of_Cases", Value=fact_sale, df=joined,
                        text_hover=Time_agg, Color_Titler="Borough",
                        titler=titler, range_of_map=range_of_map)
# Groupby and plot lineplots
def Lineplot_Month_Order_Fixer(df, Line_Factor, Value, titler="", aggregatorer="mean"):
    """Aggregate *Value* per (month, Line_Factor), sort months into
    chronological order via date_order, and draw the line plot."""
    monthly = Group_by_two_factors_df(Fact1="month_year_sold", text_hover=Line_Factor,
                                      Value=Value, df=df, aggregator=aggregatorer)
    monthly["month_year_sold_order"] = monthly.apply(Month_Orderer_Lineplot, axis=1)
    monthly = monthly.sort_values(by="month_year_sold_order")
    Ploty_Line_Plot(Fact1="month_year_sold_order", Value=Value, df=monthly,
                    text_hover=Line_Factor, titler=titler)
# Groupby and plot choropleth maps by ZIPCODE
def Group_By_And_Zip_Map(Value, text_hover="", slider_text="", aggregator="mean", titler="", log_scale=False, df=Property_csv, range_of_map=[1800, 2020]):
    """Aggregate *Value* per ZIP code (and per *slider_text* period when one
    is given) and render the result as a choropleth via Plotly_Zip_Map."""
    if text_hover == "":
        text_hover = Value
    per_zip = Group_by_two_factors_df(Fact1="ZIP CODE", Value=Value,
                                      text_hover=slider_text, df=df,
                                      aggregator=aggregator)
    Plotly_Zip_Map(color_column=Value, hover_text=text_hover,
                   slider_text=slider_text, df=per_zip, log_scale=log_scale,
                   titler=titler, range_of_map=range_of_map)
Using the above functions, I grouped the data by date or month and the value of interest into dataframes, then plotted those aggregated data points onto maps or line, bar, and scatter plots.
#Static map: total dollars of property sold per ZIP code (log10 color scale)
Group_By_And_Zip_Map(Value=" SALE PRICE ", aggregator="sum", slider_text="",
                     log_scale=True, range_of_map=[5.5, 9],
                     titler="Total Sold Properties in Dollars by Zip Code")
#Animated map: average sale price per ZIP code, one frame per month (log10 scale)
Group_By_And_Zip_Map(Value=" SALE PRICE ", aggregator="mean", slider_text="month_year_sold",
                     log_scale=True, range_of_map=[5.5, 8],
                     titler="Average Price by Zip Code")